library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Code book: description of every feature in the raw customer data set,
# read from the accompanying Excel file and printed for reference.
feature_description_original <- readxl::read_excel(
"data/feature_description.xlsx")
feature_description_original
# Raw customer data. read_csv2() is used because the file is
# semicolon-separated with ',' as the decimal mark (see the console note below).
customer_segmentation_raw <- read_csv2(
"data/customer_segmentation_test.csv",
# Explicit type for each of the 21 columns; the list is positional, so the
# order here must match the column order of the CSV file exactly.
col_types = list(col_character(), col_character(), col_character(), col_character(),
col_double(), col_double(), col_character(), col_double(), col_double(),
col_character(), col_double(), col_double(), col_character(), col_double(),
col_double(), col_character(), col_double(), col_double(), col_character(),
col_character(), col_character()),
guess_max = 400000
) %>% mutate(
# parse day-month-year formatted date strings into proper Dates
`Date of Birth` = lubridate::dmy(`Date of Birth`),
Gender = as.factor(Gender),
MERCHANDISE2015 = as.factor(MERCHANDISE2015),
MERCHANDISE2016 = as.factor(MERCHANDISE2016),
# the raw file misspells the 2017/2018 columns as "MERCHANDIESE...";
# re-create them here under the corrected names
MERCHANDISE2017 = as.factor(MERCHANDIESE2017),
MERCHANDISE2018 = as.factor(MERCHANDIESE2018),
MERCHANDISE2019 = as.factor(MERCHANDISE2019),
LastPaymentDate = lubridate::dmy(LastPaymentDate),
PenultimatePaymentDate = lubridate::dmy(PenultimatePaymentDate)
) %>% select(-c(MERCHANDIESE2017, MERCHANDIESE2018)) %>%  # drop the misspelled originals
rename(DateOfBirth = `Date of Birth`,
ID =`Customer Number`)
## i Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
# Overview of the cleaned raw data: row/column counts, completeness,
# and per-type summaries (see table below).
skimr::skim(customer_segmentation_raw)
| Name | customer_segmentation_raw |
| Number of rows | 406734 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 3 |
| factor | 6 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ID | 0 | 1.00 | 10 | 10 | 0 | 406734 | 0 |
| Postcode | 9176 | 0.98 | 1 | 9 | 0 | 2982 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DateOfBirth | 155491 | 0.62 | 1902-04-21 | 2015-03-30 | 1948-03-09 | 25514 |
| LastPaymentDate | 0 | 1.00 | 2015-01-03 | 2020-02-13 | 2018-12-06 | 1361 |
| PenultimatePaymentDate | 44699 | 0.89 | 1995-12-31 | 2020-02-05 | 2017-04-12 | 5376 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 0 | 1 | FALSE | 3 | fem: 203904, mal: 183467, fam: 19363 |
| MERCHANDISE2015 | 0 | 1 | FALSE | 2 | 0: 401845, 1: 4889 |
| MERCHANDISE2016 | 0 | 1 | FALSE | 2 | 0: 401585, 1: 5149 |
| MERCHANDISE2019 | 0 | 1 | FALSE | 2 | 0: 401470, 1: 5264 |
| MERCHANDISE2017 | 0 | 1 | FALSE | 2 | 0: 402378, 1: 4356 |
| MERCHANDISE2018 | 0 | 1 | FALSE | 2 | 0: 401470, 1: 5264 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| COUNT2015 | 0 | 1 | 2.52 | 4.00 | 0 | 0 | 2 | 2 | 96.0 | ▇▁▁▁▁ |
| SUM2015 | 0 | 1 | 42.44 | 850.19 | 0 | 0 | 15 | 45 | 388113.6 | ▇▁▁▁▁ |
| COUNT2016 | 0 | 1 | 1.22 | 2.02 | 0 | 0 | 1 | 1 | 178.0 | ▇▁▁▁▁ |
| SUM2016 | 0 | 1 | 50.93 | 591.05 | 0 | 0 | 16 | 50 | 295599.8 | ▇▁▁▁▁ |
| COUNT2017 | 0 | 1 | 1.06 | 1.91 | 0 | 0 | 0 | 1 | 95.0 | ▇▁▁▁▁ |
| SUM2017 | 0 | 1 | 24.78 | 572.90 | 0 | 0 | 0 | 20 | 207134.7 | ▇▁▁▁▁ |
| COUNT2018 | 0 | 1 | 1.00 | 1.87 | 0 | 0 | 0 | 1 | 49.0 | ▇▁▁▁▁ |
| SUM2018 | 0 | 1 | 20.64 | 1552.60 | 0 | 0 | 0 | 15 | 911146.5 | ▇▁▁▁▁ |
| COUNT2019 | 0 | 1 | 0.97 | 1.79 | 0 | 0 | 0 | 1 | 31.0 | ▇▁▁▁▁ |
| SUM2019 | 0 | 1 | 46.44 | 3999.80 | 0 | 0 | 0 | 30 | 2400000.0 | ▇▁▁▁▁ |
I am still very open to suggestions for improvement here ^^
# Austrian postcode directory (PLZ) used to enrich customers with location data.
zip_code_list <- readxl::read_excel("data/PLZ_Verzeichnis-20211201.xls")
zip_code_list
# Bookkeeping columns of the directory that carry no information for us.
unused_zip_cols <- c("gültig ab", "gültig bis", "NamePLZTyp",
                     "intern_extern", "adressierbar", "Postfach")
# Attach city (Ort) and federal state (Bundesland) via the postcode, drop the
# bookkeeping columns, keep only rows whose postcode could be resolved, and
# turn the two location columns into factors.
customer_segmentation_with_zip <- customer_segmentation_raw %>%
  left_join(zip_code_list, by = c("Postcode" = "PLZ")) %>%
  select(-all_of(unused_zip_cols)) %>%
  drop_na(Postcode, Ort, Bundesland) %>%
  mutate(across(c(Postcode, Bundesland), as.factor))
customer_segmentation_with_zip
# Months treated as the "Christmas season" when deriving the XMAS_donor flag:
# November, December, and the following January.
# (`<-` replaces `=` for top-level assignment, per the tidyverse style guide.)
XMAS_months <- c(11, 12, 1)
# Reference date used for all recency calculations in this analysis.
# A plain ISO-8601 literal needs no lubridate: base as.Date() parses it
# directly and returns the same Date value as lubridate::ymd().
reference_date <- as.Date("2021-12-17")
# Feature engineering on the zip-enriched customer data (one row per donor).
customer_segmentation_first_prepro <- customer_segmentation_with_zip %>%
mutate(
# year of the customer's birth
year_born = lubridate::year(DateOfBirth),
# age of donors at their last donation (completed years; as.integer truncates)
age_at_last_donation = lubridate::interval(DateOfBirth, LastPaymentDate) %>%
as.numeric("years") %>%
as.integer(),
# generation label derived from the birth year; birth years after 2012 and
# missing birth dates match no condition and therefore become NA
generation_moniker = case_when(
year_born <= 1945 ~ "silent" ,
year_born <= 1964 ~ "boomer",
year_born <= 1980 ~ "x",
year_born <= 1996 ~ "millennial",
year_born <= 2012 ~ "z"
) %>% as_factor(),
# total number of donations over all years
COUNTtotal = COUNT2015+
COUNT2016+
COUNT2017+
COUNT2018+
COUNT2019,
# total donation amount over all years
SUMtotal = SUM2015+
SUM2016+
SUM2017+
SUM2018+
SUM2019,
# average donation amount (COUNTtotal's minimum is 1 per the skim output,
# so division by zero does not occur here)
SUMaverage = SUMtotal / COUNTtotal,
# month of the last payment
LastPaymentMONTH = lubridate::month(LastPaymentDate) %>% as.factor(),
# month of second to last payment
PenultimatePaymentMONTH = lubridate::month(PenultimatePaymentDate) %>% as.factor(),
# year of the last payment
LastPaymentYEAR = lubridate::year(LastPaymentDate),
# year of second to last payment
PenultimatePaymentYEAR = lubridate::year(PenultimatePaymentDate),
# THIS ONE NEEDS WORK
# status as christmas donor if the last two payments were around christmas,
# but we have to tweak the time interval (is Nov to Jan too large?)
# also: donors with only one payment in total can never reach "yes";
# the "maybe" status is shady at best
# NOTE(review): the *MONTH columns are factors while XMAS_months is numeric;
# %in% coerces both sides to character, so the membership test works as intended
XMAS_donor = as_factor(case_when(LastPaymentMONTH %in% XMAS_months & PenultimatePaymentMONTH %in% XMAS_months ~ "yes",
LastPaymentMONTH %in% XMAS_months ~ "maybe",
TRUE ~ "unlikely")),
# days between last and second to last payment
# NOTE(review): the days()/day() round trip looks convoluted;
# as.integer(LastPaymentDate - PenultimatePaymentDate) should be equivalent -- confirm
donation_interval = lubridate::day(lubridate::days(LastPaymentDate - PenultimatePaymentDate)),
# days between the last payment and the reference date
# NOTE(review): LastPaymentDate - reference_date is negative for payments
# before the reference date (the skim output below shows only negative
# values); the name "days_since_last_payment" suggests the opposite sign --
# confirm the intended direction before interpreting plots of this variable
days_since_last_payment = as.integer(LastPaymentDate - reference_date),
# binary factor expressing whether any merchandise was bought over the
# observation period (clumsily coded; for factors, != 0 compares against the
# level label "0")
merchandise_any = as_factor(if_else(
!is.na(MERCHANDISE2015) & MERCHANDISE2015 != 0 |
!is.na(MERCHANDISE2016) & MERCHANDISE2016 != 0 |
!is.na(MERCHANDISE2017) & MERCHANDISE2017 != 0 |
!is.na(MERCHANDISE2018) & MERCHANDISE2018 != 0 |
!is.na(MERCHANDISE2019) & MERCHANDISE2019 != 0,
1,
0))) %>%
# grouping for the next mutation (num_of_donation_years)
# NOTE(review): IDs are unique (skim shows n_unique equal to the row count),
# so each group holds a single row and sum() adds the five per-row logical tests
group_by(ID) %>%
# number of years in which anything was donated (0-5)
mutate(num_of_donation_years = sum(COUNT2015 > 0,
COUNT2016 > 0,
COUNT2017 > 0,
COUNT2018 > 0,
COUNT2019 > 0, na.rm=T)) %>%
# ungrouping is important! ;)
# skimr builds its output per group on a grouped dataset, which is far too
# slow with one group per donor (it crashed the machine twice)
ungroup() %>%
# remove identifying / raw date variables that have no further use
select(-c(ID, DateOfBirth, LastPaymentDate, PenultimatePaymentDate))
customer_segmentation_first_prepro
customer_segmentation_first_prepro %>% skimr::skim()
| Name | Piped data |
| Number of rows | 396694 |
| Number of columns | 34 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 13 |
| numeric | 20 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Ort | 0 | 1 | 2 | 40 | 0 | 2178 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 0 | 1.00 | FALSE | 3 | fem: 199545, mal: 179215, fam: 17934 |
| Postcode | 0 | 1.00 | FALSE | 2249 | 122: 6776, 121: 6208, 110: 5941, 502: 5383 |
| MERCHANDISE2015 | 0 | 1.00 | FALSE | 2 | 0: 391818, 1: 4876 |
| MERCHANDISE2016 | 0 | 1.00 | FALSE | 2 | 0: 391552, 1: 5142 |
| MERCHANDISE2019 | 0 | 1.00 | FALSE | 2 | 0: 391460, 1: 5234 |
| MERCHANDISE2017 | 0 | 1.00 | FALSE | 2 | 0: 392339, 1: 4355 |
| MERCHANDISE2018 | 0 | 1.00 | FALSE | 2 | 0: 391460, 1: 5234 |
| Bundesland | 0 | 1.00 | FALSE | 9 | N: 88175, W: 70706, O: 66082, St: 57348 |
| generation_moniker | 146208 | 0.63 | FALSE | 5 | sil: 110508, boo: 102068, x: 33020, mil: 4734 |
| LastPaymentMONTH | 0 | 1.00 | FALSE | 12 | 12: 119035, 11: 66379, 1: 45775, 10: 42275 |
| PenultimatePaymentMONTH | 37875 | 0.90 | FALSE | 12 | 12: 91203, 11: 56900, 10: 42674, 1: 27463 |
| XMAS_donor | 0 | 1.00 | FALSE | 3 | unl: 165505, may: 119746, yes: 111443 |
| merchandise_any | 0 | 1.00 | FALSE | 2 | 0: 377620, 1: 19074 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| COUNT2015 | 0 | 1.00 | 2.56 | 4.03 | 0.00 | 0.00 | 2.00 | 4.00 | 96.0 | ▇▁▁▁▁ |
| SUM2015 | 0 | 1.00 | 41.12 | 724.36 | 0.00 | 0.00 | 15.00 | 45.00 | 388113.6 | ▇▁▁▁▁ |
| COUNT2016 | 0 | 1.00 | 1.24 | 2.03 | 0.00 | 0.00 | 1.00 | 1.00 | 178.0 | ▇▁▁▁▁ |
| SUM2016 | 0 | 1.00 | 51.20 | 596.95 | 0.00 | 0.00 | 20.00 | 50.00 | 295599.8 | ▇▁▁▁▁ |
| COUNT2017 | 0 | 1.00 | 1.08 | 1.92 | 0.00 | 0.00 | 0.00 | 1.00 | 95.0 | ▇▁▁▁▁ |
| SUM2017 | 0 | 1.00 | 24.45 | 484.85 | 0.00 | 0.00 | 0.00 | 20.00 | 207134.7 | ▇▁▁▁▁ |
| COUNT2018 | 0 | 1.00 | 1.02 | 1.88 | 0.00 | 0.00 | 0.00 | 1.00 | 49.0 | ▇▁▁▁▁ |
| SUM2018 | 0 | 1.00 | 20.76 | 1570.91 | 0.00 | 0.00 | 0.00 | 15.00 | 911146.5 | ▇▁▁▁▁ |
| COUNT2019 | 0 | 1.00 | 0.98 | 1.80 | 0.00 | 0.00 | 0.00 | 1.00 | 31.0 | ▇▁▁▁▁ |
| SUM2019 | 0 | 1.00 | 46.90 | 4049.95 | 0.00 | 0.00 | 0.00 | 30.00 | 2400000.0 | ▇▁▁▁▁ |
| year_born | 146204 | 0.63 | 1949.25 | 14.01 | 1902.00 | 1939.00 | 1948.00 | 1959.00 | 2015.0 | ▁▇▇▂▁ |
| age_at_last_donation | 146204 | 0.63 | 68.33 | 14.00 | 0.00 | 59.00 | 70.00 | 79.00 | 117.0 | ▁▁▇▇▁ |
| COUNTtotal | 0 | 1.00 | 6.87 | 9.93 | 1.00 | 2.00 | 3.00 | 7.00 | 273.0 | ▇▁▁▁▁ |
| SUMtotal | 0 | 1.00 | 184.43 | 4898.70 | 0.01 | 30.00 | 65.00 | 160.00 | 2400225.0 | ▇▁▁▁▁ |
| SUMaverage | 0 | 1.00 | 36.08 | 1530.61 | 0.01 | 11.25 | 17.34 | 29.42 | 750000.0 | ▇▁▁▁▁ |
| LastPaymentYEAR | 0 | 1.00 | 2017.78 | 1.53 | 2015.00 | 2016.00 | 2018.00 | 2019.00 | 2020.0 | ▅▂▃▇▂ |
| PenultimatePaymentYEAR | 37875 | 0.90 | 2015.72 | 3.91 | 1995.00 | 2015.00 | 2017.00 | 2018.00 | 2020.0 | ▁▁▁▃▇ |
| donation_interval | 37875 | 0.90 | 773.66 | 1215.88 | 1.00 | 123.00 | 354.00 | 762.00 | 8766.0 | ▇▁▁▁▁ |
| days_since_last_payment | 0 | 1.00 | -1293.24 | 561.24 | -2540.00 | -1814.00 | -1102.00 | -762.00 | -673.0 | ▂▂▂▃▇ |
| num_of_donation_years | 0 | 1.00 | 2.50 | 1.49 | 1.00 | 1.00 | 2.00 | 4.00 | 5.0 | ▇▅▃▂▃ |
# Drop every record without a known birth year. We lose a lot of rows, but
# roughly 251,000 complete records remain, which still seems plenty.
customer_segmentation_complete <- drop_na(customer_segmentation_first_prepro, year_born)
customer_segmentation_complete
# Christmas-donor status, split by gender
customer_segmentation_first_prepro %>%
  ggplot(aes(XMAS_donor)) +
  geom_bar() +
  facet_wrap(~Gender)
# Number of active donation years, split by generation
customer_segmentation_first_prepro %>%
  ggplot(aes(num_of_donation_years)) +
  geom_bar() +
  facet_wrap(~generation_moniker)
# Age distribution at the time of the last donation (known ages only)
customer_segmentation_first_prepro %>%
  drop_na(age_at_last_donation) %>%
  ggplot(aes(age_at_last_donation)) +
  geom_histogram(binwidth = 5)
# Total donation amount per donor and gender, restricted to (0, 5000)
customer_segmentation_first_prepro %>%
  filter(SUMtotal > 0 & SUMtotal < 5000) %>%
  ggplot(aes(x = SUMtotal)) +
  geom_histogram(binwidth = 100) +
  facet_wrap(~Gender)
# Month of the most recent payment, by gender
customer_segmentation_first_prepro %>%
  ggplot(aes(LastPaymentMONTH)) +
  geom_bar() +
  facet_wrap(~Gender)
# Month of the second most recent payment, by gender
customer_segmentation_first_prepro %>%
  ggplot(aes(PenultimatePaymentMONTH)) +
  geom_bar() +
  facet_wrap(~Gender)
# Total number of donations, capped below 7 * 6 = 42
customer_segmentation_first_prepro %>%
  filter(COUNTtotal < (7 * 6)) %>%
  ggplot(aes(COUNTtotal)) +
  geom_histogram(binwidth = 1)
# Days between the last two payments, capped at roughly five years
customer_segmentation_first_prepro %>%
  drop_na(donation_interval) %>%
  filter(donation_interval < (360 * 5)) %>%
  ggplot(aes(donation_interval)) +
  geom_histogram(binwidth = 30)
# Mean and standard deviation of the total donation amount, used to trim
# extreme outliers in the scatter plot below
mean_total_sum <- mean(customer_segmentation_first_prepro$SUMtotal, na.rm = TRUE)
sd_total_sum <- sd(customer_segmentation_first_prepro$SUMtotal, na.rm = TRUE)
# Birth year vs. total donation amount (outliers beyond mean + 6 sd removed)
customer_segmentation_first_prepro %>%
  drop_na(year_born) %>%
  filter(SUMtotal < (mean_total_sum + sd_total_sum * 6)) %>%
  ggplot(aes(year_born, SUMtotal)) +
  geom_point(alpha = 1 / 10)
# Population per Austrian federal state, taken from
# https://de.statista.com/statistik/daten/studie/75396/umfrage/entwicklung-der-bevoelkerung-in-oesterreich-nach-bundesland-seit-1996/
pop_vienna <- 1921153
pop_lower_austria <- 1691040
pop_upper_austria <- 1495756
pop_styria <- 1247159
pop_tyrol <- 760161
pop_carithia <- 562230
pop_salzburg <- 560643
pop_vorarlberg <- 399164
pop_burgenland <- 296040
# Count donors per state and normalize by population: donors per 100,000
# inhabitants. The named lookup vector maps each Bundesland code to its
# population; codes not present in the vector yield NA, exactly as the
# previous case_when with no default did.
donors_per_state_per_100_000_inhabitants <- customer_segmentation_first_prepro %>%
  count(Bundesland) %>%
  mutate(
    n = n * 100000 / unname(c(
      B = pop_burgenland, K = pop_carithia, N = pop_lower_austria,
      O = pop_upper_austria, Sa = pop_salzburg, St = pop_styria,
      T = pop_tyrol, V = pop_vorarlberg, W = pop_vienna
    )[as.character(Bundesland)])
  )
# Donors per 100,000 inhabitants, by federal state
donors_per_state_per_100_000_inhabitants %>%
  ggplot(aes(Bundesland, n)) +
  geom_col()
# Distribution of days between last payment and reference date
customer_segmentation_first_prepro %>%
  ggplot(aes(days_since_last_payment)) +
  geom_histogram(binwidth = 30)
RFM segments customers according to three variables: Recency, Frequency, Monetary Value. Using the rfm package, RFM scores can be computed either on raw transaction data (one row per transaction) or on aggregated customer data (one row per customer). For the former, the method rfm_table_order can be used; for the latter, either rfm_table_customer or rfm_table_customer_2. Since our dataset represents aggregated customer data, the latter should be used. It can be computed directly from the raw data upon adding the two variables SUMtotal and COUNTtotal:
library(rfm)
# Compute RFM scores from the aggregated per-customer data.
# rfm_table_customer_2() expects one row per customer, with a transaction
# count, a total revenue, and the date of the most recent transaction.
rfm_scores <- customer_segmentation_raw %>%
  mutate(
    # totals across the five observed years (equivalent to summing the
    # SUM2015..SUM2019 / COUNT2015..COUNT2019 columns term by term)
    SUMtotal = rowSums(across(num_range("SUM", 2015:2019))),
    COUNTtotal = rowSums(across(num_range("COUNT", 2015:2019))),
    LastPaymentDate = as.Date(LastPaymentDate)
  ) %>%
  rfm_table_customer_2(
    customer_id = ID,
    n_transactions = COUNTtotal,
    latest_visit_date = LastPaymentDate,
    total_revenue = SUMtotal,
    analysis_date = reference_date
  )
rfm_scores
## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`1` =
## "<tibble[,8]>", : replacement element 1 has 1 row to replace 0 rows
## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`1` =
## "<tibble[,8]>", : replacement element 2 has 1 row to replace 0 rows
Visual inspection of RFM scores:
# Heatmap of the RFM scores across recency/frequency score combinations
rfm_heatmap(rfm_scores)
In the above heatmap, we can see some interesting patterns (Note: The higher the recency score, the more recent the last donation):
There are further, less obvious customer segments in the heatmap. For the sake of clarity, rather than verbally describing the segments, below we visually represent the customer segments we believe to have identified in the heatmap:
# Data frame placing a text label at the centre of each customer segment
# identified in the heatmap (x = frequency score, y = recency score).
heatmap_segments_df <- data.frame(x = c(1, 3, 4.5, 0.5, 0.5, 2, 4),
                                  y = c(1.5, 1.5, 1.5, 3.5, 4.5, 4, 4),
                                  lab = c("Lost", "Loyal average donor at risk", "Don't lose",
                                          "Newbie", "Prospects", "Loyal average donor active",
                                          "Champ"))
# Plot the customer segments as shaded regions with one label each.
# Fixes vs. the previous version:
#  * annotate("rect", ...) replaces geom_rect(aes(<constants>)): with aes(),
#    every rectangle was drawn once per row of the 7-row data frame, so the
#    alpha = 0.1 fills stacked up to a much darker shade than intended.
#  * the cyan rectangle duplicated the yellow "Prospects" rectangle (both
#    covered x 0-1 / y 4-5); the duplicate has been removed.
ggplot(heatmap_segments_df, aes(x, y, label = lab)) +
  annotate("rect", xmin = 0, xmax = 2, ymin = 0, ymax = 3, fill = "red", alpha = 0.1) +
  annotate("rect", xmin = 2, xmax = 4, ymin = 0, ymax = 3, fill = "blue", alpha = 0.1) +
  annotate("rect", xmin = 4, xmax = 5, ymin = 0, ymax = 3, fill = "green", alpha = 0.1) +
  annotate("rect", xmin = 0, xmax = 1, ymin = 3, ymax = 4, fill = "tomato", alpha = 0.1) +
  annotate("rect", xmin = 0, xmax = 1, ymin = 4, ymax = 5, fill = "yellow", alpha = 0.1) +
  annotate("rect", xmin = 1, xmax = 3, ymin = 3, ymax = 5, fill = "orange", alpha = 0.1) +
  annotate("rect", xmin = 3, xmax = 5, ymin = 3, ymax = 5, fill = "magenta", alpha = 0.1) +
  geom_text()
The rfm_segment method can be used to assign donors to a given segment based on their RFM scores. To this end, the upper and lower bounds of recency, frequency and monetary scores for each segment, as well as the respective segment names, need to be defined. However, the code below throws an error, so probably there is a bug in the definition of the lower/upper segment bounds. ToDo: Fix the bug, or remove this.
As an alternative to rfm_segment, segments can be assigned to donors with the help of hand-crafted if-else rules. However, this segmentation is not useful, because it yields a very high number of donors belonging to the "Other" segment (approx. 25%). The reason for this is probably the aforementioned error in the definition of the upper/lower segment bounds.
# Assign each donor to a hand-crafted segment based on their R/F/M scores.
# case_when() replaces the previous seven-level nested ifelse(): the rules
# are evaluated top to bottom and the first match wins, exactly as before,
# with "Other" as the fall-through default. The score ranges themselves are
# unchanged (including the gaps that produce the large "Other" share noted
# in the text above).
rfm_segments <- rfm_scores$rfm %>%
  mutate(segment = case_when(
    recency_score %in% 4:5 & frequency_score %in% 4:5 & monetary_score %in% 4:5 ~ "Champ",
    recency_score %in% 4:5 & frequency_score %in% 2:3 & monetary_score %in% 1:3 ~ "Regular avg active",
    recency_score %in% 5 & frequency_score %in% 1 & monetary_score %in% 4:5 ~ "Prospect",
    recency_score %in% 4 & frequency_score %in% 1 & monetary_score %in% 1:3 ~ "Newbie",
    recency_score %in% 1:3 & frequency_score %in% 5 & monetary_score %in% 4:5 ~ "Don't loose",
    recency_score %in% 1:3 & frequency_score %in% 3:4 & monetary_score %in% 3:4 ~ "Regular avg at risk",
    recency_score %in% 1:3 & frequency_score %in% 1:2 & monetary_score %in% 1:2 ~ "Lost",
    TRUE ~ "Other"
  ))
# Distribution of donors over the hand-crafted segments
rfm_segments %>% ggplot(aes(segment)) +
  geom_bar()
# Share of donors per segment, descending. TRUE is spelled out instead of T:
# T is an ordinary (reassignable) binding and therefore unsafe.
rfm_segments$segment %>% table() %>% prop.table() %>% round(3) %>% sort(decreasing = TRUE)
## .
## Lost Other Champ Regular avg at risk
## 0.267 0.250 0.211 0.133
## Regular avg active Don't loose Newbie Prospect
## 0.068 0.048 0.021 0.002
To remedy the faulty segmentation shown above, we resort to the customer segments (and the respective RFM score thresholds) presented in class (see slide deck of first class, p. 82 as well as https://rpubs.com/Eddie_Zaldivar/705462). We use this mainstream segmentation as our baseline:
# Names of the ten baseline segments. Order matters: the six threshold
# vectors below are aligned positionally with these names.
segment_names_baseline <- c("Champions", "Loyal Customers", "Potential Loyalist",
"New Customers", "Promising", "Need Attention", "About To Sleep",
"At Risk", "Can't Lose Them", "Lost")
# set the upper and lower bounds for recency, frequency, and monetary for
# each segment; element i of every vector belongs to segment_names_baseline[i]
recency_lower <- c(4, 2, 3, 4, 3, 2, 2, 1, 1, 1)
recency_upper <- c(5, 5, 5, 5, 4, 3, 3, 2, 1, 2)
frequency_lower <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
frequency_upper <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
monetary_lower <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
monetary_upper <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
# assign a segment to each customer
# NOTE(review): several score ranges overlap (e.g. "Champions" is a subset of
# "Loyal Customers"), so the result depends on the order in which
# rfm_segment() applies the rules; donors matching no range end up as
# "Others" -- confirm the precedence against the rfm package documentation
rfm_segments_baseline <- rfm_segment(rfm_scores,
segment_names_baseline,
recency_lower,
recency_upper,
frequency_lower,
frequency_upper,
monetary_lower,
monetary_upper)
# inspect segment assignment
head(rfm_segments_baseline)
The mainstream customer segmentation is better than our own approach, since it yields far fewer "Other" instances (only approximately 6.3% of donors are assigned to this segment):
# Distribution of donors over the baseline segments
rfm_segments_baseline %>% ggplot(aes(segment)) +
  geom_bar()
# Share of donors per segment (2 digits), descending. TRUE spelled out
# instead of T, which is a reassignable binding and therefore unsafe.
rfm_segments_baseline$segment %>% table() %>% prop.table() %>% round(2) %>% sort(decreasing = TRUE)
## .
## Loyal Customers Champions Potential Loyalist At Risk
## 0.24 0.21 0.19 0.11
## Lost About To Sleep Others Need Attention
## 0.09 0.08 0.06 0.03
# Same shares with 3 digits for finer resolution (TRUE instead of unsafe T)
rfm_segments_baseline$segment %>% table() %>% prop.table() %>% round(3) %>% sort(decreasing = TRUE)
## .
## Loyal Customers Champions Potential Loyalist At Risk
## 0.241 0.211 0.185 0.107
## Lost About To Sleep Others Need Attention
## 0.085 0.080 0.063 0.027
Finally, we can inspect median scores for each RFM component per segment:
# Median recency per segment
rfm_plot_median_recency(rfm_segments_baseline)
# Median transaction (donation) count per segment
rfm_plot_median_frequency(rfm_segments_baseline)
# Median monetary value (total donation amount) per segment
rfm_plot_median_monetary(rfm_segments_baseline)
For the clustering (no matter whether OPTICS, k-means, or hierarchical), we need to define a strategy to deal with missing values. We decided to take two approaches: (a) drop all records containing missing values, (b) drop the variables containing many missing values (i.e. the age-related variables). Then we can compare both results.
Also, it will be necessary to dummy-code nominal variables, and to scale numeric variables. A code-snippet for these preprocessing operations implemented in the tidymodels package is provided below:
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## -- Attaching packages -------------------------------------- tidymodels 0.1.4 --
## v broom 0.7.10 v rsample 0.1.1
## v dials 0.0.10 v tune 0.1.6
## v infer 1.0.0 v workflows 0.2.4
## v modeldata 0.1.1 v workflowsets 0.1.0
## v parsnip 0.1.7 v yardstick 0.0.9
## v recipes 0.1.17
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org
# Preprocessing for clustering: drop the high-cardinality location columns
# and the age-derived generation label, dummy-code the remaining nominal
# variables, and scale all numeric ones. recipe() without a formula treats
# every column as an undeclared-role variable, which is sufficient here.
# NOTE(review): step_scale() divides by the standard deviation only (no
# centring); if zero-mean variables are wanted for the distance-based
# clustering, step_normalize() would be the alternative -- confirm intent.
# The "new levels in a factor: NA" warning below stems from NA values in the
# factor columns being dummy-coded.
customers_prep <- customer_segmentation_first_prepro %>% select(-c(Ort, Postcode, generation_moniker)) %>% recipe() %>%
step_dummy(all_nominal()) %>%
step_scale(all_numeric()) %>%
prep() %>%
bake(new_data = NULL)
## Warning: There are new levels in a factor: NA